# Baseado em Gertler et al. (2018), capítulo 7
# http://hdl.handle.net/10986/25030

# Install any required package that is not yet available, then load them below.
#   haven      - read_stata(), to import the Stata .dta file
#   data.table - as.data.table(), to display frequency tables
#   Hmisc      - describe(), for variable summaries
#   sandwich   - vcovCL(), cluster-robust covariance matrices
#   lmtest     - coeftest(), coefficient tests with a custom covariance
required_pkgs <- c("haven", "data.table", "Hmisc", "sandwich", "lmtest")

# Compare against the row names (package names) only; matching against the
# whole installed.packages() matrix would also search the version/dependency
# columns.
missing_pkgs <- required_pkgs[!required_pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

library(haven)
library(data.table)
library(Hmisc)
library(sandwich)
library(lmtest)

# Read the data from Gertler et al. (2018) into the data frame `dados`.
# The file path is relative to the current working directory.
dados <- as.data.frame(read_stata("evaluation.dta"))

# Open the data viewer (useful to inspect variable labels). Only do so in an
# interactive session: a non-interactive run (e.g. Rscript) may have no
# display available for View().
if (interactive()) {
  View(dados)
}
head(dados)

# Keep only the localities where HISP has been offered (data frame `df2`).
# First, tabulate the column treatment_locality.
describe(dados$treatment_locality)
as.data.table(table(dados$treatment_locality))

dim(dados)
# which() keeps only rows where the comparison is TRUE. Plain logical indexing
# would instead insert an all-NA row for every missing treatment_locality.
df2 <- dados[which(dados$treatment_locality == 1), ]
dim(df2)

# Tabulate the dummy `enrolled`: our treatment variable.
describe(df2$enrolled)
as.data.table(table(df2$enrolled))

# In this particular dataset, all eligible households, and only eligible
# households, enrolled.
table(df2$eligible, df2$enrolled)

# Difference-in-differences in a regression framework.
# The model compares how health expenditures change across rounds for enrolled
# versus non-enrolled households within the treatment localities. The
# enrolled:round interaction is the term of interest (presumably `round` is a
# 0/1 baseline/follow-up indicator -- verify against the variable labels).
reg1 <- lm(
  health_expenditures ~ enrolled * round,
  data = df2
)
summary(reg1)

# Same model, but with standard errors clustered at the locality level.
# Background reading on clustered standard errors:
# https://en.wikipedia.org/wiki/Clustered_standard_errors
# http://www.nber.org/papers/w24003
# https://doi.org/10.1093/qje/qjac038
coeftest(reg1, vcov = vcovCL, cluster = ~locality_identifier)

# Extended specification: add household-level controls to the same model.
reg2 <- lm(
  health_expenditures ~ enrolled * round +
    age_hh + age_sp + educ_hh + educ_sp + female_hh + indigenous +
    hhsize + dirtfloor + bathroom + land + hospital_distance,
  data = df2
)
summary(reg2)
# Locality-clustered standard errors for the extended specification.
coeftest(reg2, vcov = vcovCL, cluster = ~locality_identifier)